When removing the gene ARHGAP11B (ENSG00000187951) in the Gandal dataset there was an important change in the WGCNA modules, and when analysing what could have caused this, we found that this gene had a very big outlier value for a single sample, which we think altered the values of the other genes during the vst transformation enough to alter the modules by the WGNCA algorithm.
Since the preprocessing pipeline did not catch this behaviour and it ended up causing problems in downstream analysis, we want to see how often we can find these type of behaviours in the data, if/how much they alter the results of WGCNA modules, and if it was just a behaviour specific to the Gandal dataset or if it can be found in others as well.
Note: This ARHGAP11B gene is not in this dataset
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(WGCNA)
library(expss)
library(knitr)
library(polycor)
# Load preprocessed data to get DE information and the genes that weren't filtered during preprocessing
load('./../../AllRegions/Data/filtered_raw_data.RData')
datExpr_filtered = datExpr
datGenes_filtered = datGenes
datMeta_filtered = datMeta
load('./../../AllRegions/Data/preprocessed_data.RData')
DE_info = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'DE' = padj<0.05) %>%
dplyr::select(ID, log2FoldChange, padj, DE)
# Load original expression datasets
datExpr = read.delim('./../Data/datExpr.csv', row.names=1)
rownames(datExpr) = substr(rownames(datExpr), 1, 15)
# Filtering genes: These filters would be the same, so I'll just keep the genes present in the
# filtered dataset instead of repeating the whole filtering
datExpr = datExpr[rownames(datExpr) %in% rownames(datExpr_filtered),colnames(datExpr) %in% datMeta$ID]
datGenes = datGenes_filtered %>% filter(ensembl_gene_id %in% rownames(datExpr_filtered))
rm(datExpr_filtered, datGenes_filtered, dds)
\(metric_i = max_j \frac{|x_{i,j} - mean(x_i)|}{sd(x_i)}\)
z_scores = datExpr %>% apply(1, function(x) abs(x-mean(x))/sd(x)) %>% t %>% data.frame
Genes with the highest z-score in any of its entries
max_z_scores = data.frame('ID' = rownames(z_scores), 'max_z_score' = z_scores %>% apply(1, max),
'outlier_sample' = z_scores %>% apply(1, function(x) datMeta$ID[which.max(x)])) %>%
left_join(DE_info, by = 'ID')
summary(max_z_scores$max_z_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.559 4.817 5.889 5.901 6.918 10.089
p = max_z_scores %>% ggplot(aes(ID, max_z_score+1, color = DE)) + geom_point(alpha=0.3) +
xlab('Genes') + ylab('Max |Z-score|') + theme_minimal() + scale_y_log10() +
ggtitle('Max|z-score| value for each gene') +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
legend.position='none', panel.grid.major = element_blank())
ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)
rm(p)
Top 20 genes
These genes have usually a signle outlier sample
Most of the outliers correspond to ASD samples but there are some from Control samples as well
This values don’t seem to be as extreme as the ones found in the Gandal dataset
Standardising the max_z_score value of each gene, all of the 20 top genes would be considered outliers (more than 2 sd from the average max_z_score value)
plot_function = function(i){
i = 3*i-2
plot_list = list()
for(j in 1:3){
plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]),
'Diagnosis' = datMeta$Diagnosis)
colnames(plot_data)[2] = 'expr'
plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) +
geom_point() + theme_minimal() +
theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
}
p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
list(x = 0.1 , y = 1.05, text = top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
return(p)
}
top_genes = max_z_scores %>% dplyr::top_n(n=21, w=max_z_score) %>% arrange(desc(max_z_score)) %>%
left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
mutate('StandMaxZscore' = (max_z_score-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)) %>%
dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, max_z_score, StandMaxZscore)
kable(top_genes, caption='Top 20 genes with the highest max z-score value')
| ID | hgnc_symbol | log2FoldChange | padj | DE | max_z_score | StandMaxZscore |
|---|---|---|---|---|---|---|
| ENSG00000256618 | MTRNR2L1 | 1.9916981 | 0.0401708 | TRUE | 10.089062 | 3.064010 |
| ENSG00000173110 | HSPA6 | -1.0669636 | 0.2221616 | FALSE | 10.078125 | 3.056009 |
| ENSG00000175592 | FOSL1 | 0.5073645 | 0.6024915 | FALSE | 10.002990 | 3.001047 |
| ENSG00000197893 | NRAP | 0.4561690 | NA | NA | 9.940447 | 2.955295 |
| ENSG00000161298 | ZNF382 | 0.2422453 | NA | NA | 9.932058 | 2.949158 |
| ENSG00000006327 | TNFRSF12A | 0.0166290 | 0.9874780 | FALSE | 9.923768 | 2.943094 |
| ENSG00000155975 | VPS37A | 0.3799827 | 0.1502730 | FALSE | 9.812672 | 2.861825 |
| ENSG00000139629 | GALNT6 | 0.1749755 | 0.5941066 | FALSE | 9.811609 | 2.861047 |
| ENSG00000100362 | PVALB | -0.8270296 | 0.0335412 | TRUE | 9.735750 | 2.805555 |
| ENSG00000142765 | SYTL1 | 0.6009630 | NA | NA | 9.712743 | 2.788725 |
| ENSG00000121410 | A1BG | -0.0890489 | NA | NA | 9.704083 | 2.782390 |
| ENSG00000133800 | LYVE1 | -0.2529720 | 0.6515612 | FALSE | 9.665992 | 2.754526 |
| ENSG00000174576 | NPAS4 | -0.5234097 | 0.5679929 | FALSE | 9.664777 | 2.753637 |
| ENSG00000166246 | C16orf71 | 0.0102824 | NA | NA | 9.631361 | 2.729192 |
| ENSG00000100122 | CRYBB1 | -0.0385638 | 0.9461326 | FALSE | 9.616935 | 2.718640 |
| ENSG00000132518 | GUCY2D | -0.5383085 | NA | NA | 9.607344 | 2.711623 |
| ENSG00000149257 | SERPINH1 | 0.3468448 | 0.5449874 | FALSE | 9.602366 | 2.707982 |
| ENSG00000123119 | NECAB1 | 0.0920081 | 0.7425828 | FALSE | 9.592995 | 2.701127 |
| ENSG00000166278 | C2 | 0.6753946 | 0.0365333 | TRUE | 9.590246 | 2.699116 |
| ENSG00000243284 | VSIG8 | -0.2627459 | NA | NA | 9.586191 | 2.696150 |
| ENSG00000169313 | P2RY12 | -0.2472338 | 0.7498587 | FALSE | 9.573445 | 2.686826 |
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)
If there was no dependence between the samples and the outliers, then the maximum |z-scores| would be evenly distributed along the 106 samples, with ~130 maximum z-scores in each sample
Defining outlier samples: Samples with a standardised Count > 2 (everything above the dotted line)
samples_info = max_z_scores$outlier_sample %>% table %>% data.frame %>%
dplyr::rename(Sample_ID = '.', Count = 'Freq') %>%
left_join(datMeta, by = c('Sample_ID' = 'ID')) %>% arrange(desc(Count)) %>%
mutate('StandardisedCount' = round((Count-mean(Count))/sd(Count),2)) %>%
dplyr::select(Sample_ID, Count, StandardisedCount, Subject_ID, Gender, Age, brain_lobe, SiteHM, Diagnosis)
ggplotly(samples_info %>% ggplot(aes(Subject_ID, Count, fill=Diagnosis, color=brain_lobe)) +
geom_bar(stat = 'identity', size = 0, position = position_dodge2(preserve = 'single')) + xlab('Subjects') +
geom_hline(yintercept = nrow(max_z_scores)/nrow(datMeta), color = 'gray') +
geom_hline(yintercept = mean(samples_info$Count)+2*sd(samples_info$Count), color = 'gray', linetype = 'dotted') +
ggtitle('Number of genes for which their maximum |z-score| value fell in each Sample') +
theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = 'none'))
kable(samples_info %>% filter(Count>nrow(max_z_scores)/nrow(datMeta)),
caption = 'Samples with more max z-scores than expected by random assignment')
| Sample_ID | Count | StandardisedCount | Subject_ID | Gender | Age | brain_lobe | SiteHM | Diagnosis |
|---|---|---|---|---|---|---|---|---|
| ba19.s69 | 8264 | 9.91 | s69 | F | 18 | Occipital | M | CTL |
| ba19.s7 | 2099 | 2.40 | s7 | M | 39 | Occipital | H | ASD |
| ba10.s72 | 653 | 0.64 | s72 | M | 13 | Frontal | M | CTL |
| ba19.s6 | 523 | 0.48 | s6 | M | 82 | Occipital | H | ASD |
| ba10.s14 | 377 | 0.30 | s14 | M | 2 | Frontal | H | ASD |
| ba10.s11 | 262 | 0.16 | s11 | F | 18 | Frontal | H | ASD |
| ba10.s73 | 234 | 0.13 | s73 | M | 16 | Frontal | M | CTL |
| ba19.s2 | 225 | 0.12 | s2 | M | 27 | Occipital | H | ASD |
| ba10.s85 | 224 | 0.12 | s85 | M | 17 | Frontal | M | CTL |
To see if the samples classified as outliers have a different general behaviour than the other samples (not just on the genes which had their maximum value in them), we calculate the z-score value each gene has on each sample and see their distribution
The outlier samples seem to have a higher distribution of z-scores along all of the genes when comparing them to random samples, not just the one which corresponded to the max|z-score|
plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>mean(samples_info$Count)+2*sd(samples_info$Count)]
# Calculate z-score of each gene for the outlier samples
for(s in outlier_samples){
outlier_idx = which(datMeta$ID == s)
z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)
# Select random samples for comparison
set.seed(123)
rand_samp_1 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_1 = which(datMeta$ID==rand_samp_1)
set.seed(124)
rand_samp_2 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_2 = which(datMeta$ID==rand_samp_2)
set.seed(125)
rand_samp_3 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_3 = which(datMeta$ID==rand_samp_3)
cat(paste0('Using random samples ', rand_samp_1, ', ', rand_samp_2, ', ', rand_samp_3, ' as a reference'))
## Using random samples ba19.s35, ba19.s75, ba19.s34 as a reference
z_func = function(x, rand_samp_idx) return(abs(x[rand_samp_idx]-mean(x))/sd(x))
# Transform data for plotting
levels = c(unique(as.character(samples_info$Sample_ID[samples_info$StandardisedCount>2])),
'Random Sample 1','Random Sample 2','Random Sample 3')
plot_data_melt = plot_data %>% mutate('Random Sample 1'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_1)),
'Random Sample 2'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_2)),
'Random Sample 3'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_3))) %>%
melt() %>% mutate(variable = factor(variable, levels=levels, ordered=T)) %>%
mutate(ID = case_when(variable == 'Random Sample 1' ~ rand_samp_1,
variable == 'Random Sample 2' ~ rand_samp_2,
variable == 'Random Sample 3' ~ rand_samp_3,
TRUE ~ as.character(variable))) %>%
left_join(datMeta %>% dplyr::select(ID, Diagnosis), by = 'ID')
# Plot
ggplotly(plot_data_melt %>% ggplot(aes(variable, value+1, fill=Diagnosis)) + geom_boxplot() +
xlab('Samples') +ylab('|z-scores|') + scale_y_log10() + theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, rand_samp_1, rand_samp_idx_1,
rand_samp_2, rand_samp_idx_2, rand_samp_3, rand_samp_idx_3, levels, z_func)
Currently used metric:
Create weighted sample correlation network
Calculate the connectivity of each node
Normalise connectivity of the nodes (samples)
Filter nodes (samples) with a connectivity lower than -2 (low connectivity, with a distance larger than 2 sd to the mean connectivity of the network)
Notes:
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
original_z_score = data.frame('ID' = gsub('X','',names(z.ku)), 'z_score_orig' = as.vector(z.ku)) %>%
left_join(datMeta, by = 'ID') %>%
dplyr::select(ID, z_score_orig)
plot_data = samples_info %>% left_join(original_z_score, by = c('Sample_ID'='ID')) %>%
mutate('norm_count' = (Count-mean(Count))/sd(Count))
ggplotly(plot_data %>% ggplot(aes(z_score_orig, norm_count, color=Diagnosis)) +
geom_point(alpha=0.8, aes(id=Sample_ID)) +
geom_vline(xintercept = -2, color='gray', linetype = 'dotted') +
geom_hline(yintercept = 2, color='gray', linetype = 'dotted') +
xlab('Currently used metric') + ylab('Standardised number of max |z-score| genes in sample') +
theme_minimal())
rm(absadj, netsummary, ku, z.ku, original_z_score)
This time it looks like the original metric doesn’t agree as mucho with the PCA plot of the genes, but our method doesn’t do a very good job either
pca = datExpr %>% t %>% prcomp
pca_data = data.frame('ID' = gsub('X','',colnames(datExpr)), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(datMeta, by='ID') %>%
dplyr::select(ID, PC1, PC2) %>% left_join(plot_data, by = c('ID'='Sample_ID'))
p1 = ggplotly(pca_data %>% ggplot(aes(PC1, PC2, color=-StandardisedCount)) +
geom_point(alpha=0.8, aes(id=ID)) +
xlab(paste0('PC1 (', round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (', round(100*summary(pca)$importance[2,2],1),'%)')) +
scale_color_viridis() + theme_minimal() + theme(legend.position = 'none') +
coord_fixed())
p2 = ggplotly(pca_data %>% ggplot(aes(PC1, PC2, color=z_score_orig)) +
geom_point(alpha=0.8, aes(id=ID)) +
xlab(paste0('PC1 (', round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (', round(100*summary(pca)$importance[2,2],1),'%)')) +
ggtitle('Outlier genes in each sample (left) and Original sample outlier metric (right)') +
scale_color_viridis() +
theme_minimal() + theme(legend.position = 'none') + coord_fixed())
subplot(p1, p2, nrows=1)
rm(pca, pca_data, p1, p2)
\(metric_i = max_j \frac{|x_{i,j} - median(x_i)|}{mad(x_i)}\)
z_scores = datExpr %>% apply(1, function(x) abs(x-median(x))/mad(x)) %>% t %>% data.frame
Genes with the highest z-score in any of their entries
max_z_scores = data.frame('ID' = rownames(z_scores), 'max_z_score' = z_scores %>% apply(1, max),
'outlier_sample' = z_scores %>% apply(1, function(x) datMeta$ID[which.max(x)])) %>%
left_join(DE_info, by = 'ID')
The dotted line indicates the value for the gene we removed ARHGAP11B (ENSG00000187951)
summary(max_z_scores$max_z_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.83 8.08 11.69 14.49 17.17 2179.45
p = max_z_scores %>% ggplot(aes(ID, max_z_score+1, color = DE)) + geom_point(alpha=0.3) +
xlab('Genes') + ylab('Max |Z-score|') + theme_minimal() + scale_y_log10() +
ggtitle('Max|z-score| value for each gene') +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
legend.position='none', panel.grid.major = element_blank())
ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)
rm(p)
Top 20 genes
These genes have more than one outlier sample
Most of the outliers correspond to ASD samples but it’s not as extreme as with the Gandal dataset
There no longer seem to be a lot of DE genes
The standardised max_z_score values of each of the top genes are huge! Probably because of the really long right tail of this metric’s distribution
plot_function = function(i){
i = 3*i-2
plot_list = list()
for(j in 1:3){
plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]),
'Diagnosis' = datMeta$Diagnosis)
colnames(plot_data)[2] = 'expr'
plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) +
geom_point() + theme_minimal() +
theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
}
p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
list(x = 0.1 , y = 1.05, text = top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
return(p)
}
top_genes = max_z_scores %>% dplyr::top_n(n=21, w=max_z_score) %>% arrange(desc(max_z_score)) %>%
left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
mutate('StandMaxZscore' = (max_z_score-mean(max_z_scores$max_z_score))/sd(max_z_scores$max_z_score)) %>%
dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, max_z_score, StandMaxZscore)
kable(top_genes, caption='Top 20 genes with the highest max z-score value')
| ID | hgnc_symbol | log2FoldChange | padj | DE | max_z_score | StandMaxZscore |
|---|---|---|---|---|---|---|
| ENSG00000173110 | HSPA6 | -1.0669636 | 0.2221616 | FALSE | 2179.4483 | 73.587340 |
| ENSG00000256618 | MTRNR2L1 | 1.9916981 | 0.0401708 | TRUE | 1675.1691 | 56.446812 |
| ENSG00000196136 | SERPINA3 | 1.4141957 | 0.0716810 | FALSE | 1632.2280 | 54.987237 |
| ENSG00000174576 | NPAS4 | -0.5234097 | 0.5679929 | FALSE | 685.5044 | 22.807955 |
| ENSG00000175592 | FOSL1 | 0.5073645 | 0.6024915 | FALSE | 300.8229 | 9.732572 |
| ENSG00000108691 | CCL2 | -0.7728324 | 0.3445928 | FALSE | 192.2299 | 6.041479 |
| ENSG00000026508 | CD44 | 0.6639284 | 0.2819296 | FALSE | 188.7825 | 5.924301 |
| ENSG00000255823 | MTRNR2L8 | -1.0174587 | 0.2073937 | FALSE | 165.1880 | 5.122321 |
| ENSG00000104951 | IL4I1 | 0.6933161 | 0.3081869 | FALSE | 161.6248 | 5.001209 |
| ENSG00000006327 | TNFRSF12A | 0.0166290 | 0.9874780 | FALSE | 147.7135 | 4.528360 |
| ENSG00000198502 | HLA-DRB5 | 1.0225666 | 0.2127074 | FALSE | 147.2638 | 4.513076 |
| ENSG00000162645 | GBP2 | 0.7160785 | 0.1557531 | FALSE | 130.0482 | 3.927916 |
| ENSG00000133048 | CHI3L1 | 0.9654080 | 0.0259421 | TRUE | 127.7419 | 3.849521 |
| ENSG00000025708 | TYMP | 0.8388151 | 0.0179278 | TRUE | 119.3012 | 3.562622 |
| ENSG00000155975 | VPS37A | 0.3799827 | 0.1502730 | FALSE | 118.1903 | 3.524862 |
| ENSG00000139629 | GALNT6 | 0.1749755 | 0.5941066 | FALSE | 115.9066 | 3.447239 |
| ENSG00000169313 | P2RY12 | -0.2472338 | 0.7498587 | FALSE | 113.7828 | 3.375052 |
| ENSG00000197249 | SERPINA1 | 0.7627801 | 0.0788746 | FALSE | 112.9367 | 3.346293 |
| ENSG00000100362 | PVALB | -0.8270296 | 0.0335412 | TRUE | 111.2947 | 3.290480 |
| ENSG00000163682 | RPL9 | 0.0460735 | 0.9473003 | FALSE | 109.2164 | 3.219838 |
| ENSG00000143546 | S100A8 | 0.6803657 | 0.4131652 | FALSE | 108.6543 | 3.200733 |
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)
The distribution of the genes with the max |z-score| is exactly the same as in the previous section because the two metrics are a monotonic transformation of the entries by row (by gene), so the maximum of each row is reached in the same sample in both metrics
To see if the samples classified as outliers have a different general behaviour than the other samples (not just on the genes which had their maximum value in them), we calculate the z-score value each gene has on each sample and see their distribution.
The outlier samples seem to have a higher distribution of z-scores along all of the genes when comparing them to random samples, not just the one which corresponded to the max|z-score|
The results are very similar to the ones from the first metric
plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>mean(samples_info$Count)+2*sd(samples_info$Count)]
# Calculate z-score of each gene for the outlier samples
for(s in outlier_samples){
outlier_idx = which(datMeta$ID == s)
z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)
# Select random samples for comparison
set.seed(123)
rand_samp_1 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_1 = which(datMeta$ID==rand_samp_1)
set.seed(124)
rand_samp_2 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_2 = which(datMeta$ID==rand_samp_2)
set.seed(125)
rand_samp_3 = sample(datMeta$ID[! datMeta$ID %in% outlier_samples],1) %>% as.character
rand_samp_idx_3 = which(datMeta$ID==rand_samp_3)
cat(paste0('Using random samples ', rand_samp_1, ', ', rand_samp_2, ', ', rand_samp_3, ' as a reference'))
## Using random samples ba19.s35, ba19.s75, ba19.s34 as a reference
z_func = function(x, rand_samp_idx) return(abs(x[rand_samp_idx]-mean(x))/sd(x))
# Transform data for plotting
levels = c(unique(as.character(samples_info$Sample_ID[samples_info$StandardisedCount>2])),
'Random Sample 1','Random Sample 2','Random Sample 3')
plot_data_melt = plot_data %>% mutate('Random Sample 1'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_1)),
'Random Sample 2'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_2)),
'Random Sample 3'=apply(datExpr,1,function(x) z_func(x, rand_samp_idx_3))) %>%
melt() %>% mutate(variable = factor(variable, levels=levels, ordered=T)) %>%
mutate(ID = case_when(variable == 'Random Sample 1' ~ rand_samp_1,
variable == 'Random Sample 2' ~ rand_samp_2,
variable == 'Random Sample 3' ~ rand_samp_3,
TRUE ~ as.character(variable))) %>%
left_join(datMeta %>% dplyr::select(ID, Diagnosis), by = 'ID')
# Plot
ggplotly(plot_data_melt %>% ggplot(aes(variable, value+1, fill=Diagnosis)) + geom_boxplot() +
xlab('Samples') +ylab('|z-scores|') + scale_y_log10() + theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, rand_samp_1, rand_samp_idx_1,
rand_samp_2, rand_samp_idx_2, rand_samp_3, rand_samp_idx_3, levels, z_func)
The plot is exactly the same as with the first metric, since the maximums by row happen in the same columns, so the counts by sample are the same in both methods
We know that when using the metrics for sample outlier detection they give exactly the same results, but they seem to vary greatly when using them for gene outlier detection
Using a distance of 3 standard deviations to define outliers in both methods:
plot_data = data.frame('ID' = rownames(datExpr),
'orig_z_score' = apply(datExpr, 1, function(x) max(abs(x-mean(x))/sd(x))),
'robust_z_score' = apply(datExpr, 1, function(x) max(abs(x-median(x))/mad(x)))) %>%
mutate(outliers_orig_z_score = orig_z_score > mean(orig_z_score) + 3*sd(orig_z_score),
outliers_robust_z_score = robust_z_score > mean(robust_z_score) + 3*sd(robust_z_score),
alpha = ifelse(ID == 'ENSG00000187951', 1, 0.2)) %>%
mutate(Outliers = case_when(outliers_orig_z_score & outliers_robust_z_score ~ 'Both',
outliers_orig_z_score & !outliers_robust_z_score ~ 'Only Original',
!outliers_orig_z_score & outliers_robust_z_score ~ 'Only Robust',
TRUE ~ 'Neither')) %>%
mutate(Outliers = factor(Outliers, levels = c('Neither','Only Original', 'Only Robust', 'Both')))
plot_data %>% ggplot(aes(orig_z_score, robust_z_score, color=Outliers)) + geom_point(alpha=plot_data$alpha) +
geom_vline(xintercept = mean(plot_data$orig_z_score) + 3*sd(plot_data$orig_z_score),
color = 'gray', linetype = 'dashed') +
geom_hline(yintercept = mean(plot_data$robust_z_score) + 3*sd(plot_data$robust_z_score),
color = 'gray', linetype = 'dashed') +
geom_smooth(method = 'gam', se = FALSE, color='gray') +
ggtitle(paste0('R^2 = ', round(cor(plot_data$orig_z_score, plot_data$robust_z_score)[[1]]^2,2))) +
scale_y_log10() + scale_color_viridis_d() + xlab('Original max|z-score|') +
ylab('Robust max|z-score|') + theme_minimal()
table_info = plot_data %>% apply_labels(outliers_orig_z_score = 'Outliers using Original Metric',
outliers_robust_z_score = 'Outliers using Robust Metric')
cro(table_info$outliers_orig_z_score, list(table_info$outliers_robust_z_score, total()))
| Outliers using Robust Metric | #Total | |||
|---|---|---|---|---|
| FALSE | TRUE | |||
| Outliers using Original Metric | ||||
| FALSE | 13676 | 19 | 13695 | |
| TRUE | 3 | 3 | ||
| #Total cases | 13676 | 22 | 13698 | |
rm(table_info)
Repeating the plot above but colouring genes depending on whether they are DE or not:
plot_data = data.frame('ID' = rownames(datExpr),
'orig_z_score' = apply(datExpr, 1, function(x) max(abs(x-mean(x))/sd(x))),
'robust_z_score' = apply(datExpr, 1, function(x) max(abs(x-median(x))/mad(x))),
'meanExpr' = log2(rowMeans(datExpr))) %>%
left_join(DE_info, by = 'ID') %>%
mutate(outliers_orig_z_score = orig_z_score > mean(orig_z_score) + 3*sd(orig_z_score),
outliers_robust_z_score = robust_z_score > mean(robust_z_score) + 3*sd(robust_z_score),
alpha = ifelse(ID == 'ENSG00000187951', 1, 0.2))
plot_data %>% ggplot(aes(orig_z_score, robust_z_score, color=DE)) + geom_point(alpha=plot_data$alpha) +
geom_vline(xintercept = mean(plot_data$orig_z_score) + 3*sd(plot_data$orig_z_score),
color = 'gray', linetype = 'dashed') +
geom_hline(yintercept = mean(plot_data$robust_z_score) + 3*sd(plot_data$robust_z_score),
color = 'gray', linetype = 'dashed') +
geom_smooth(method = 'gam', se = FALSE) +
ggtitle(paste0('R^2 = ', round(cor(plot_data$orig_z_score, plot_data$robust_z_score)[[1]]^2,2))) +
scale_y_log10() + xlab('Original max|z-score|') + ylab('Robust max|z-score|') + theme_minimal()
rm(table_info)
If we order the genes by their max|z-score| and calculate the percentage of DE genes using a rolling average, we can see if there’s a change in this percentage of DE genes for different levels of the metric:
In general, as the scores increase, the percentage of DE genes decreases, which makes sense, since the score is measuring a certain kind of noise
There is a small increase in the genes with the highest scores in both methods, but it’s quite small
Note: The genes in this plot are ordered by their max|z-score| value, so there are actually two orderings of genes in the plot, one for the original metric and the other for the robust metric. That’s why there are different patterns in the % of DE genes for each metric
w = 2000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)
for(i in 1:nrow(sliding_DE)){
sliding_DE$original[i] = mean(original_sort$DE[i:(i+w)], na.rm = TRUE)
sliding_DE$robust[i] = mean(robust_sort$DE[i:(i+w)], na.rm = TRUE)
}
sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>%
ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) +
geom_hline(yintercept = 100*mean(plot_data$DE, na.rm=TRUE), color = 'gray') +
geom_smooth(se=FALSE) + ylab('% DE genes in window') +
scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()
rm(w, sliding_DE, original_sort, robust_sort, i)
If we plot now the rolling mean of the |LFC|:
For both metrics the LFC increases as the value of the metric increases (this makes sense because noisier samples would give bigger differences)
The |LFC| increases more in the robust version of the metric than in the original
w = 2000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)
for(i in 1:nrow(sliding_DE)){
sliding_DE$original[i] = mean(abs(original_sort$log2FoldChange[i:(i+w)]), na.rm = TRUE)
sliding_DE$robust[i] = mean(abs(robust_sort$log2FoldChange[i:(i+w)]), na.rm = TRUE)
}
sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>%
ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) +
geom_hline(yintercept = 100*mean(abs(plot_data$log2FoldChange), na.rm=TRUE), color = 'gray') +
geom_smooth(se=FALSE) + ylab('Mean LFC of genes in window') +
scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()
rm(w, sliding_DE, original_sort, robust_sort, i)
For the first half of the plot (~windows 0-6000): |LFC| remains roughly constant but %DE decreases steadily
Even though the change between Diagnoses remains the same, the increase in outlier samples (reflected in the position of the genes in a higher window) decreases the significance of the comparison, resulting in higher (and no longer significant) p-values
This means that the presence of outliers decreases the confidence of the DEA
For the second half of the plot (~windows 6000-1200): The %DE genes continues decreasing but in a less steep manner remains constant using the original metric but increases (maybe exponentially?) using the robust metric. At the same time, the |LFC| increases steadily on both metrics, but at a higher rate in the robust metric
For the Original metric we can give a similar conclusion as in the first half of the plot: The more extreme the outliers of a gene are, the more they damage the confidence of the DEA, resulting in genes with very high |LFC| that share the same level of confidence with genes with a not so extreme |LFC|
For the Robust metric, both the % of DE genes and the |LFC| increase, which sounds reasonable. The only problem with this is that we don’t know if this increase in DE genes is because of biological signals which are being mistaken for technical effects by the max|z-score|, or technical effects which are confused by biological signals by DESeq2. This problem is specific to this dataset, since consistently, the outlier values correspond to ASD samples
If we plot the rolling mean of the mean level of expression of the genes:
The level of experssion plays an important role in both of the scores (same pattern found in Gandal)
Genes with lower mean expression tend to have larger max|z-score| values
w = 1000
sliding_DE = data.frame('window' = 1:(nrow(datExpr)-w), 'original' = 0, 'robust' = 0)
original_sort = plot_data %>% arrange(orig_z_score)
robust_sort = plot_data %>% arrange(robust_z_score)
for(i in 1:nrow(sliding_DE)){
sliding_DE$original[i] = mean(original_sort$meanExpr[i:(i+w)], na.rm = TRUE)
sliding_DE$robust[i] = mean(robust_sort$meanExpr[i:(i+w)], na.rm = TRUE)
}
sliding_DE %>% melt(id.vars='window') %>% dplyr::rename(Metric = variable) %>%
ggplot(aes(window, 100*value, color=Metric)) + geom_line(alpha=0.3) +
geom_hline(yintercept = 100*mean(plot_data$meanExpr, na.rm=TRUE), color = 'gray') +
geom_smooth(se=FALSE) + ylab('Mean Mean Level of Expression in window') +
scale_color_manual(values = c('#e60073','#008080')) + theme_minimal()
rm(w, sliding_DE, original_sort, robust_sort, i)
When using these methods to identify outlier samples, they give exactly the same results because they are both a monotonic transformation by rows of the counts data, so the maximums are achieved in the same positions (samples) in both cases
When using these methods to identify outlier genes, they give different results, the first one identifying genes with a single, huge outlier, and the second one identifying genes with multiple big outliers
I think the first metric could be better since a single huge outlier is almost certainly a technical error and not a biological effect, but this distinction would be more difficult to distinguish with the second metric, where it identifies multiple outliers (I think the linear fits on the plot above proves that this could be happening up to some extent)
Could the relation between the robust metric and DE genes be the opposite and the robust metric is showing a weakness in the identification of DE genes when there are multiple technical errors? In this case, the Robust metric could help identify “fake” DE genes … but since this behaviour is unique to this dataset I don’t know how relevant it would be to study it, since just a comparison with another dataset (Gupta) could eliminate these false positives
Mean expression plays a role in the value of both metrics, with genes with lower levels of expression having higher values